# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu.items import JianshuItem
import time
import itertools
import sys
# Python 2-only idiom: re-import ``sys`` so that ``setdefaultencoding`` is
# available again, then make UTF-8 the interpreter-wide default codec for
# implicit str/unicode conversions.
# NOTE(review): ``reload`` is not a builtin on Python 3 — this module is
# Python 2 code as written.
reload(sys)   
sys.setdefaultencoding('utf8') 

class JianshuspiderSpider(CrawlSpider):
    """Crawl Jianshu user pages and collect per-user profile statistics.

    The crawl starts from the recommended-users page and follows every link
    whose URL matches ``/users/`` or ``/u/``; each followed page is handed
    to :meth:`parse_item`.
    """

    name = 'jianshuspider'
    allowed_domains = ['www.jianshu.com']
    start_urls = ['http://www.jianshu.com/recommendations/users']

    rules = (
        Rule(LinkExtractor(allow=r'/users/'), callback='parse_item', follow=True),
        Rule(LinkExtractor(allow=r'/u/'), callback='parse_item', follow=True),
    )

    # A page whose URL contains any of these substrings is not scraped.
    # NOTE(review): presumably these are secondary views of a user's profile
    # (follow lists, timeline, ...) rather than the main page — confirm.
    _SKIPPED_URL_MARKERS = (
        'follow',
        'subscriptions',
        'liked',
        'order_by',
        'shared',
        'timeline',
    )

    def parse_item(self, response):
        """Build a ``JianshuItem`` from a user's profile page.

        Args:
            response: the downloaded page passed in by the crawl rules.

        Returns:
            A populated ``JianshuItem`` when the page yields exactly one
            user name, three linked counters and two plain counters.
            ``None`` when the URL is one of the skipped views or the page
            does not have that layout, so that no empty item is emitted.
        """
        users_url = response.url
        if any(marker in users_url for marker in self._SKIPPED_URL_MARKERS):
            return None

        title = response.xpath(
            '//div[@class="title"]/a[@class="name"]/text()').extract()
        # Linked counters: following count, follower count, article count.
        info = response.xpath(
            '//div[@class="info"]/ul/li/div/a/p/text()').extract()
        # Plain counters: word count, likes received.
        info2 = response.xpath(
            '//div[@class="info"]/ul/li/div/p/text()').extract()

        # Only pages with the expected profile layout produce an item;
        # anything else is dropped instead of yielding an empty item.
        if len(title) != 1 or len(info) != 3 or len(info2) != 2:
            return None

        # Text fragments of the self-introduction are glued with "-*-" and
        # every ';' is removed.
        # NOTE(review): presumably ';' is stripped to keep a delimited
        # export intact — confirm against the item pipeline.
        intro_parts = response.xpath('//div[@class="js-intro"]/text()').extract()
        intro = "-*-".join(intro_parts).replace(';', '')

        # Single parenthesised argument: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("Name:%s******URL:%s" % (title[0], users_url))

        item = JianshuItem()
        item['name'] = title[0]          # user name
        item['users_url'] = users_url    # link to the user's home page
        item['attention'] = info[0]      # number of users followed
        item['fans'] = info[1]           # number of followers
        item['article'] = info[2]        # number of articles
        item['words_num'] = info2[0]     # total word count
        item['gain_like'] = info2[1]     # likes received
        item['intro'] = intro            # self-introduction
        return item
